diff --git a/.gitignore b/.gitignore index 938d8e3f..d7b58315 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,13 @@ secrets/pwd_api.txt secrets/username_api.txt +secrets/username_api.txt +secrets/* documents-experts/ +llm/ cc-bio.json *.xlsx coverage_re + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docker-compose.yml b/docker-compose.yml index b28dfbc2..af3b0666 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -125,6 +125,50 @@ services: logging: # no logs for postgres container driver: none + + llm: + ports: + - 5555:5555 + build: + context: ./ + dockerfile: Dockerfile_api_import + entrypoint: ["python", "quotaclimat/sentiment/enrich_with_llm.py"] + environment: + ENV: docker # change me to prod for real cases + LOGLEVEL: DEBUG # Change me to info (debug, info, warning, error) to have less log + PYTHONPATH: /app + POSTGRES_USER: user + POSTGRES_DB: barometre + POSTGRES_PASSWORD: password + POSTGRES_HOST: postgres_db + POSTGRES_PORT: 5432 + PORT_HS: 5555 # healthcheck + HEALTHCHECK_SERVER: "0.0.0.0" + MEDIATREE_USER : /run/secrets/username_api + MEDIATREE_PASSWORD: /run/secrets/pwd_api + SCW_SECRET: /run/secrets/scw_api + SCW_API_URL: /run/secrets/scw_api_url + MEDIATREE_AUTH_URL: https://keywords.mediatree.fr/api/auth/token/ + KEYWORDS_URL: https://keywords.mediatree.fr/api/subtitle/ # https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList + MODIN_ENGINE: ray + MODIN_CPUS: 4 # "https://modin.readthedocs.io/en/0.11.0/using_modin.html#reducing-or-limiting-the-resources-modin-can-use" + MODIN_MEMORY: 1000000000 # 1Gb + RAY_memory_usage_threshold: 1 + mem_limit: "1G" + volumes: + - ./quotaclimat/:/app/quotaclimat/ + - ./llm/:/app/llm/ + - ./postgres/:/app/postgres/ + - ./test/:/app/test/ + secrets: + - pwd_api + - username_api + - scw_api + - scw_api_url + depends_on: + postgres_db: + condition: service_healthy + mediatree: ports: - 5050:5050 @@ -156,6 +200,7 @@ services: CHANNEL : fr3-idf # to reimport only one 
channel MEDIATREE_USER : /run/secrets/username_api MEDIATREE_PASSWORD: /run/secrets/pwd_api + SCW_SECRET: /run/secrets/scw_api MEDIATREE_AUTH_URL: https://keywords.mediatree.fr/api/auth/token/ KEYWORDS_URL: https://keywords.mediatree.fr/api/subtitle/ # https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList MODIN_ENGINE: ray @@ -170,6 +215,7 @@ services: secrets: - pwd_api - username_api + - scw_api depends_on: nginxtest: condition: service_healthy @@ -200,4 +246,8 @@ secrets: # https://docs.docker.com/compose/use-secrets/ pwd_api: file: secrets/pwd_api.txt username_api: - file: secrets/username_api.txt \ No newline at end of file + file: secrets/username_api.txt + scw_api: + file: secrets/scw_api.txt + scw_api_url: + file: secrets/scw_api_url.txt \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 47ec2f1d..f6798622 100644 --- a/poetry.lock +++ b/poetry.lock @@ -170,6 +170,27 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] +[[package]] +name = "anyio" +version = "4.7.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.9" +files = [ + {file = "anyio-4.7.0-py3-none-any.whl", hash = "sha256:ea60c3723ab42ba6fff7e8ccb0488c898ec538ff4df1f1d5e642c3601d07e352"}, + {file = "anyio-4.7.0.tar.gz", hash = "sha256:2f834749c602966b7d456a7567cafcb309f96482b5081d14ac93ccd457f9dd48"}, +] + +[package.dependencies] +idna = ">=2.8" +sniffio = ">=1.1" +typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} + +[package.extras] +doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx_rtd_theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "truststore (>=0.9.1)", "uvloop (>=0.21)"] +trio = ["trio 
(>=0.26.1)"] + [[package]] name = "asyncio" version = "3.4.3" @@ -834,6 +855,17 @@ files = [ {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, ] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "dulwich" version = "0.21.7" @@ -1204,6 +1236,62 @@ files = [ docs = ["Sphinx", "furo"] test = ["objgraph", "psutil"] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "httpcore" +version = "1.0.7" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd"}, + {file = "httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<1.0)"] + +[[package]] +name = "httpx" +version = "0.28.0" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.28.0-py3-none-any.whl", hash = "sha256:dc0b419a0cfeb6e8b34e85167c0da2671206f5095f1baa9663d23bcfd6b535fc"}, + {file = "httpx-0.28.0.tar.gz", hash = "sha256:0858d3bab51ba7e386637f22a61d8ccddaeec5f3fe4209da3a6168dbb91573e0"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +httpcore = "==1.*" +idna = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +zstd = ["zstandard (>=0.18.0)"] + [[package]] name = "hyperlink" version = "21.0.0" @@ -1393,6 +1481,90 @@ files = [ test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] trio = ["async_generator", "trio"] +[[package]] +name = "jiter" +version = "0.8.0" +description = "Fast iterable JSON parser." +optional = false +python-versions = ">=3.8" +files = [ + {file = "jiter-0.8.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:dee4eeb293ffcd2c3b31ebab684dbf7f7b71fe198f8eddcdf3a042cc6e10205a"}, + {file = "jiter-0.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aad1e6e9b01cf0304dcee14db03e92e0073287a6297caf5caf2e9dbfea16a924"}, + {file = "jiter-0.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:504099fb7acdbe763e10690d560a25d4aee03d918d6a063f3a761d8a09fb833f"}, + {file = "jiter-0.8.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2373487caad7fe39581f588ab5c9262fc1ade078d448626fec93f4ffba528858"}, + {file = "jiter-0.8.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c341ecc3f9bccde952898b0c97c24f75b84b56a7e2f8bbc7c8e38cab0875a027"}, + {file = "jiter-0.8.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0e48e7a336529b9419d299b70c358d4ebf99b8f4b847ed3f1000ec9f320e8c0c"}, + {file = "jiter-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:f5ee157a8afd2943be690db679f82fafb8d347a8342e8b9c34863de30c538d55"}, + {file = "jiter-0.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d7dceae3549b80087f913aad4acc2a7c1e0ab7cb983effd78bdc9c41cabdcf18"}, + {file = "jiter-0.8.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e29e9ecce53d396772590438214cac4ab89776f5e60bd30601f1050b34464019"}, + {file = "jiter-0.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fa1782f22d5f92c620153133f35a9a395d3f3823374bceddd3e7032e2fdfa0b1"}, + {file = "jiter-0.8.0-cp310-none-win32.whl", hash = "sha256:f754ef13b4e4f67a3bf59fe974ef4342523801c48bf422f720bd37a02a360584"}, + {file = "jiter-0.8.0-cp310-none-win_amd64.whl", hash = "sha256:796f750b65f5d605f5e7acaccc6b051675e60c41d7ac3eab40dbd7b5b81a290f"}, + {file = "jiter-0.8.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f6f4e645efd96b4690b9b6091dbd4e0fa2885ba5c57a0305c1916b75b4f30ff6"}, + {file = "jiter-0.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f61cf6d93c1ade9b8245c9f14b7900feadb0b7899dbe4aa8de268b705647df81"}, + {file = "jiter-0.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0396bc5cb1309c6dab085e70bb3913cdd92218315e47b44afe9eace68ee8adaa"}, + {file = "jiter-0.8.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62d0e42ec5dc772bd8554a304358220be5d97d721c4648b23f3a9c01ccc2cb26"}, + {file = "jiter-0.8.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec4b711989860705733fc59fb8c41b2def97041cea656b37cf6c8ea8dee1c3f4"}, + {file = "jiter-0.8.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:859cc35bf304ab066d88f10a44a3251a9cd057fb11ec23e00be22206db878f4f"}, + {file = "jiter-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5000195921aa293b39b9b5bc959d7fa658e7f18f938c0e52732da8e3cc70a278"}, + {file = "jiter-0.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash 
= "sha256:36050284c0abde57aba34964d3920f3d6228211b65df7187059bb7c7f143759a"}, + {file = "jiter-0.8.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a88f608e050cfe45c48d771e86ecdbf5258314c883c986d4217cc79e1fb5f689"}, + {file = "jiter-0.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:646cf4237665b2e13b4159d8f26d53f59bc9f2e6e135e3a508a2e5dd26d978c6"}, + {file = "jiter-0.8.0-cp311-none-win32.whl", hash = "sha256:21fe5b8345db1b3023052b2ade9bb4d369417827242892051244af8fae8ba231"}, + {file = "jiter-0.8.0-cp311-none-win_amd64.whl", hash = "sha256:30c2161c5493acf6b6c3c909973fb64ae863747def01cc7574f3954e0a15042c"}, + {file = "jiter-0.8.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:d91a52d8f49ada2672a4b808a0c5c25d28f320a2c9ca690e30ebd561eb5a1002"}, + {file = "jiter-0.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c38cf25cf7862f61410b7a49684d34eb3b5bcbd7ddaf4773eea40e0bd43de706"}, + {file = "jiter-0.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6189beb5c4b3117624be6b2e84545cff7611f5855d02de2d06ff68e316182be"}, + {file = "jiter-0.8.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e13fa849c0e30643554add089983caa82f027d69fad8f50acadcb21c462244ab"}, + {file = "jiter-0.8.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d7765ca159d0a58e8e0f8ca972cd6d26a33bc97b4480d0d2309856763807cd28"}, + {file = "jiter-0.8.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1b0befe7c6e9fc867d5bed21bab0131dfe27d1fa5cd52ba2bced67da33730b7d"}, + {file = "jiter-0.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7d6363d4c6f1052b1d8b494eb9a72667c3ef5f80ebacfe18712728e85327000"}, + {file = "jiter-0.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a873e57009863eeac3e3969e4653f07031d6270d037d6224415074ac17e5505c"}, + {file = "jiter-0.8.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = 
"sha256:2582912473c0d9940791479fe1bf2976a34f212eb8e0a82ee9e645ac275c5d16"}, + {file = "jiter-0.8.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:646163201af42f55393ee6e8f6136b8df488253a6533f4230a64242ecbfe6048"}, + {file = "jiter-0.8.0-cp312-none-win32.whl", hash = "sha256:96e75c9abfbf7387cba89a324d2356d86d8897ac58c956017d062ad510832dae"}, + {file = "jiter-0.8.0-cp312-none-win_amd64.whl", hash = "sha256:ed6074552b4a32e047b52dad5ab497223721efbd0e9efe68c67749f094a092f7"}, + {file = "jiter-0.8.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:dd5e351cb9b3e676ec3360a85ea96def515ad2b83c8ae3a251ce84985a2c9a6f"}, + {file = "jiter-0.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ba9f12b0f801ecd5ed0cec29041dc425d1050922b434314c592fc30d51022467"}, + {file = "jiter-0.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7ba461c3681728d556392e8ae56fb44a550155a24905f01982317b367c21dd4"}, + {file = "jiter-0.8.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a15ed47ab09576db560dbc5c2c5a64477535beb056cd7d997d5dd0f2798770e"}, + {file = "jiter-0.8.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cef55042816d0737142b0ec056c0356a5f681fb8d6aa8499b158e87098f4c6f8"}, + {file = "jiter-0.8.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:549f170215adeb5e866f10617c3d019d8eb4e6d4e3c6b724b3b8c056514a3487"}, + {file = "jiter-0.8.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f867edeb279d22020877640d2ea728de5817378c60a51be8af731a8a8f525306"}, + {file = "jiter-0.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:aef8845f463093799db4464cee2aa59d61aa8edcb3762aaa4aacbec3f478c929"}, + {file = "jiter-0.8.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:d0d6e22e4062c3d3c1bf3594baa2f67fc9dcdda8275abad99e468e0c6540bc54"}, + {file = "jiter-0.8.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = 
"sha256:079e62e64696241ac3f408e337aaac09137ed760ccf2b72b1094b48745c13641"}, + {file = "jiter-0.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74d2b56ed3da5760544df53b5f5c39782e68efb64dc3aa0bba4cc08815e6fae8"}, + {file = "jiter-0.8.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:798dafe108cba58a7bb0a50d4d5971f98bb7f3c974e1373e750de6eb21c1a329"}, + {file = "jiter-0.8.0-cp313-none-win32.whl", hash = "sha256:ca6d3064dfc743eb0d3d7539d89d4ba886957c717567adc72744341c1e3573c9"}, + {file = "jiter-0.8.0-cp313-none-win_amd64.whl", hash = "sha256:38caedda64fe1f04b06d7011fc15e86b3b837ed5088657bf778656551e3cd8f9"}, + {file = "jiter-0.8.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:bb5c8a0a8d081c338db22e5b8d53a89a121790569cbb85f7d3cfb1fe0fbe9836"}, + {file = "jiter-0.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:202dbe8970bfb166fab950eaab8f829c505730a0b33cc5e1cfb0a1c9dd56b2f9"}, + {file = "jiter-0.8.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9046812e5671fdcfb9ae02881fff1f6a14d484b7e8b3316179a372cdfa1e8026"}, + {file = "jiter-0.8.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e6ac56425023e52d65150918ae25480d0a1ce2a6bf5ea2097f66a2cc50f6d692"}, + {file = "jiter-0.8.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7dfcf97210c6eab9d2a1c6af15dd39e1d5154b96a7145d0a97fa1df865b7b834"}, + {file = "jiter-0.8.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4e3c8444d418686f78c9a547b9b90031faf72a0a1a46bfec7fb31edbd889c0d"}, + {file = "jiter-0.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6507011a299b7f578559084256405a8428875540d8d13530e00b688e41b09493"}, + {file = "jiter-0.8.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0aae4738eafdd34f0f25c2d3668ce9e8fa0d7cb75a2efae543c9a69aebc37323"}, + {file = "jiter-0.8.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:7f5d782e790396b13f2a7b36bdcaa3736a33293bdda80a4bf1a3ce0cd5ef9f15"}, + {file = "jiter-0.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:cc7f993bc2c4e03015445adbb16790c303282fce2e8d9dc3a3905b1d40e50564"}, + {file = "jiter-0.8.0-cp38-none-win32.whl", hash = "sha256:d4a8a6eda018a991fa58ef707dd51524055d11f5acb2f516d70b1be1d15ab39c"}, + {file = "jiter-0.8.0-cp38-none-win_amd64.whl", hash = "sha256:4cca948a3eda8ea24ed98acb0ee19dc755b6ad2e570ec85e1527d5167f91ff67"}, + {file = "jiter-0.8.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ef89663678d8257063ce7c00d94638e05bd72f662c5e1eb0e07a172e6c1a9a9f"}, + {file = "jiter-0.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c402ddcba90b4cc71db3216e8330f4db36e0da2c78cf1d8a9c3ed8f272602a94"}, + {file = "jiter-0.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a6dfe795b7a173a9f8ba7421cdd92193d60c1c973bbc50dc3758a9ad0fa5eb6"}, + {file = "jiter-0.8.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8ec29a31b9abd6be39453a2c45da067138a3005d65d2c0507c530e0f1fdcd9a4"}, + {file = "jiter-0.8.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a488f8c54bddc3ddefaf3bfd6de4a52c97fc265d77bc2dcc6ee540c17e8c342"}, + {file = "jiter-0.8.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aeb5561adf4d26ca0d01b5811b4d7b56a8986699a473d700757b4758ef787883"}, + {file = "jiter-0.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ab961858d7ad13132328517d29f121ae1b2d94502191d6bcf96bddcc8bb5d1c"}, + {file = "jiter-0.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a207e718d114d23acf0850a2174d290f42763d955030d9924ffa4227dbd0018f"}, + {file = "jiter-0.8.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:733bc9dc8ff718a0ae4695239e9268eb93e88b73b367dfac3ec227d8ce2f1e77"}, + {file = "jiter-0.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:d1ec27299e22d05e13a06e460bf7f75f26f9aaa0e0fb7d060f40e88df1d81faa"}, + {file = "jiter-0.8.0-cp39-none-win32.whl", hash = "sha256:e8dbfcb46553e6661d3fc1f33831598fcddf73d0f67834bce9fc3e9ebfe5c439"}, + {file = "jiter-0.8.0-cp39-none-win_amd64.whl", hash = "sha256:af2ce2487b3a93747e2cb5150081d4ae1e5874fce5924fc1a12e9e768e489ad8"}, + {file = "jiter-0.8.0.tar.gz", hash = "sha256:86fee98b569d4cc511ff2e3ec131354fafebd9348a487549c31ad371ae730310"}, +] + [[package]] name = "jmespath" version = "1.0.1" @@ -2042,6 +2214,30 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "openai" +version = "1.57.0" +description = "The official Python library for the openai API" +optional = false +python-versions = ">=3.8" +files = [ + {file = "openai-1.57.0-py3-none-any.whl", hash = "sha256:972e36960b821797952da3dc4532f486c28e28a2a332d7d0c5407f242e9d9c39"}, + {file = "openai-1.57.0.tar.gz", hash = "sha256:76f91971c4bdbd78380c9970581075e0337b5d497c2fbf7b5255078f4b31abf9"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +jiter = ">=0.4.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.11,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] + [[package]] name = "openpyxl" version = "3.1.5" @@ -3574,6 +3770,17 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + 
"""Extract and score claims about a keyword from media transcripts with an LLM.

Rows are read from a CSV file, each row's ``plaintext`` is sent to an
OpenAI-compatible chat-completion endpoint, the JSON answer is parsed into
``Claim`` objects and every batch of claims is written to its own CSV file.
"""
from openai import OpenAI
import os
import logging
import csv
import json
import asyncio

from postgres.schemas.models import create_tables, get_sitemap, connect_to_db
from postgres.insert_data import save_to_pg
from typing import List
from dataclasses import dataclass
from asyncio import Semaphore
import pandas as pd
import glob

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Semaphore(5) gives 30 out of 1000 :429 errors
semaphore = Semaphore(4)  # Limit the number of concurrent API calls


def _read_secret(env_name: str) -> str | None:
    """Return the value of ``env_name``, resolving Docker secret file paths.

    docker-compose.yml sets SCW_SECRET / SCW_API_URL to paths such as
    ``/run/secrets/scw_api``; in that case the secret is the *content* of the
    file, not the path. A value that is not an existing file is returned
    unchanged, so plain environment values keep working.
    """
    value = os.getenv(env_name)
    if value and os.path.isfile(value):
        with open(value, encoding="utf-8") as secret_file:
            # Secret files commonly end with a newline that must not be sent.
            return secret_file.read().strip()
    return value


# get from env variable (or from the Docker secret file it points to)
SCW_SECRET_KEY = _read_secret("SCW_SECRET")
SCW_API_URL = _read_secret("SCW_API_URL")
# https://www.scaleway.com/en/docs/ai-data/generative-apis/reference-content/rate-limits/
client = OpenAI(
    base_url=SCW_API_URL,  # Scaleway's Generative APIs service URL
    api_key=SCW_SECRET_KEY  # Your unique API secret key from Scaleway
)


class Claim:
    """One claim returned by the LLM, linked to the row it was extracted from."""

    def __init__(self, claim: str, note: int, sentiment: str, theme: str, keyword_id: str):
        self.claim = claim            # text of the claim
        self.note = note              # score given by the LLM (the prompt asks for 1 to 5)
        self.sentiment = sentiment    # main sentiment label given by the LLM
        self.theme = theme            # main theme label given by the LLM
        self.keyword_id = keyword_id  # "id" of the input row the claim comes from


def read_csv(file_path: str) -> list:
    """Read ``file_path`` as a CSV with a header row.

    Returns:
        One dict per data row, keyed by the header names.
    """
    logging.info("Reading csv")
    # newline="" is what the csv module requires; utf-8 because the
    # transcripts contain accented characters.
    with open(file_path, "r", encoding="utf-8", newline="") as file:
        return list(csv.DictReader(file))


def save_to_csv(file_path: str, data: list[Claim]):
    """Write ``data`` to ``file_path`` as CSV, overwriting any existing file.

    The first row is the header
    ``claim, note, sentiment, theme, keyword_id``.
    """
    logging.warning(f"Saving into csv {len(data)} claims")
    # newline="" prevents the csv writer from emitting blank lines on
    # platforms that translate "\n"; utf-8 keeps accented characters intact.
    with open(file_path, "w", encoding="utf-8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["claim", "note", "sentiment", "theme", "keyword_id"])
        writer.writerows(
            [claim.claim, claim.note, claim.sentiment, claim.theme, claim.keyword_id]
            for claim in data
        )


async def get_claim_sentiment_async(plaintext: str, keyword: str) -> str | None:
    """Ask the LLM for the claims made around ``keyword`` in ``plaintext``.

    The blocking client call runs in a worker thread and the module-level
    semaphore caps the number of concurrent requests.

    Returns:
        The raw message content returned by the model (expected to be JSON),
        or ``None`` when the API call failed.
    """
    # NOTE: \\" is deliberate. Inside a (non-raw) f-string a single \" collapses
    # to a bare quote, which made the last example below invalid JSON and hid
    # the escaping the prompt asks the model to apply.
    prompt = f"""
    Tu es un expert des enjeux climat, énergie, et crise de la biodiversité.
    Les contenus ci-dessous proviennent de transcription imparfaite en Speech to text de contenus médiatique Télévision ou Radio.
    Dans cet extrait, il est fait mention du mot clé {keyword}.
    Identifie les différentes claims relatives autour de ce mot clé ({keyword}), et attribue une note de 1 à 5
    1 étant un contenu positif à l'égard des écologistes, par exemple soutenant l'importance de défendre la cause écologique.
    3 étant un contenu neutre, présentant un invité par exemple ou le fait que des écologistes se soient exprimés.
    5 étant un contenu à charge, diffamant, voire criminalisant, par exemple dramatisant des actions violentes, ou expliquant que l'écologie est une religion, voire un risque démocratique.
    En rajoutant le sentiment principal autour de l'utilisation de ce mot clé avec le champs "sentiment".
    Et en rajoutant le theme principal de la claim avec le champs "theme".
    Tu ne réponds que sur ce format json sans aucun commentaire car c'est un programme python qui lit la réponse, et en échappant les caractères spéciaux dans l'objet "claims" comme les guilletmets avec \\" :
    {{
        "claims": [
            {{
                "claim": "les écologistes veulent la décroissance et le déclin de la civilisation",
                "note": 5,
                "sentiment": "négatif",
                "theme": "décroissance"
            }},
            {{
                "claim": "les écologistes violent qui détruisent des méga bassines",
                "note": 5,
                "sentiment": "ironique",
                "theme": "violence"
            }},
            {{
                "claim": "vous êtes représentante des écologistes",
                "note": 3,
                "sentiment": "neutre",
                "theme": "politique"
            }},
            {{
                "claim": "les écologistes qui se battent pour la défense de l'environnement et la protection de la santé de nos concitoyens",
                "note": 1,
                "sentiment": "positif",
                "theme": "militantisme"
            }},
            {{
                "claim": "l'écologiste Marie Toussaint estime que le discours d'Emmanuel Macron promeut un \\"grand productivisme vert\\" qui ignore les limites de la planète",
                "note": 1,
                "sentiment": "négatif",
                "theme": "écologie"
            }}
        ]
    }}
    """
    try:
        async with semaphore:
            completion = await asyncio.to_thread(
                client.chat.completions.create,
                model="llama-3.1-70b-instruct",
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": f"le mot clé est {keyword} et l'extrait à analyser {plaintext}"}
                ],
                temperature=0.7,
                max_tokens=500
            )
        logging.info(f"API call completed for {plaintext[0:20]}...")
        return completion.choices[0].message.content
    except Exception as e:
        # Best effort: a failed call must not abort the whole batch, but it is
        # an error and must be visible as such in the logs.
        logging.error(f"Error during API call: {e}")
        return None


async def process_batch(batch: List[dict], keyword: str) -> List[Claim]:
    """Processes a batch of rows with parallel API calls.

    Each row must provide the keys ``plaintext`` and ``id``. Rows whose API
    call returned nothing are skipped.
    """
    tasks = [
        asyncio.create_task(
            get_claim_sentiment_async(row["plaintext"], keyword)
        )
        for row in batch
    ]
    # gather() keeps the order of the tasks, so results line up with the rows.
    results = await asyncio.gather(*tasks)

    claims = []
    for response, row in zip(results, batch):
        if response:
            claims.extend(parse_json_response_llm_to_claim(response, row["id"]))
    return claims


async def process_batches(data: List[dict], batch_size: int, keyword: str, output_dir: str):
    """Processes data in batches, saving the results to separate CSV files.

    Batch starting at row ``i`` is written to ``{output_dir}/claims-{i}.csv``.
    """
    total_len = len(data)
    if output_dir:
        # Create the directory up front: otherwise the first save fails only
        # after a whole batch of API calls has already been made.
        os.makedirs(output_dir, exist_ok=True)
    for start_index in range(0, total_len, batch_size):
        batch = data[start_index:start_index + batch_size]
        logging.info(f"Processing batch {start_index} to {start_index + len(batch)}... out of {total_len}")
        claims = await process_batch(batch, keyword)
        output_file = f"{output_dir}/claims-{start_index}.csv"
        save_to_csv(output_file, claims)
        logging.info(f"Batch {start_index} saved with {len(claims)} claims.")


###
# Example of the response expected from the LLM:
# {
#     "claims": [
#         {
#             "claim": "c' est quand même très écologique tout ça",
#             "note": 1,
#             "sentiment": "positif",
#             "theme": "écologie urbaine"
#         },
#         {
#             "claim": "la ville de lyon et très écologiste est en fête des lumières",
#             "note": 1,
#             "sentiment": "positif",
#             "theme": "politique environnementale"
#         }
#     ]
# }
def parse_json_response_llm_to_claim(response: str, keyword_id: str) -> List[Claim]:
    """Parse the JSON text returned by the LLM into ``Claim`` objects.

    Args:
        response: JSON document with a top-level ``"claims"`` list whose items
            have the keys ``claim``, ``note``, ``sentiment`` and ``theme``.
        keyword_id: identifier stored on every resulting claim.

    Returns:
        The parsed claims, or an empty list when ``response`` is ``None`` or
        does not match the expected format (the problem is logged).
    """
    logging.info(f"Parse response {response}")
    if response is None:
        logging.warning(f"Empty response - ignore keyword_id: {keyword_id}")
        return []

    try:
        response = json.loads(response)
        logging.info(f"Length responses {response}")
        output = []
        # parse it to Claim
        for claim in response["claims"]:
            logging.info(f"claim : {claim}")
            claim_parsed = Claim(claim["claim"], claim["note"], claim["sentiment"], claim["theme"], keyword_id)
            output.append(claim_parsed)

        return output
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; KeyError / TypeError cover
        # valid JSON that does not have the expected structure.
        logging.error(f"Error during parsing response: {e}")
        return []


# TODO store in PG instead of CSV
# use for now head -n 1 file1.csv > combined.out && tail -n+2 -q *.csv >> combined.out.csv
def save_csv_to_pg():
    """Concatenate every CSV in ``/app/llm/output/`` and save it to Postgres.

    The combined frame is handed to ``save_to_pg`` with the table name
    ``"claims"``. Nothing is done when the directory holds no CSV file.
    """
    path = "/app/llm/output/"
    all_files = glob.glob(os.path.join(path, "*.csv"))
    logging.info(f"Reading all_files {all_files}")
    if not all_files:
        # pd.concat raises ValueError on an empty list.
        logging.warning(f"No csv file found in {path} - nothing to save")
        return

    frames = []
    for filename in all_files:
        logging.info(f"Reading file {filename}")
        # save_to_csv writes the column names on the FIRST row: header=0.
        # header=1 used the first claim as column names and dropped it.
        frames.append(pd.read_csv(filename, header=0))

    frame = pd.concat(frames, axis=0, ignore_index=True)
    conn = connect_to_db()
    try:
        save_to_pg(frame, "claims", conn)
    finally:
        # Release the connection even when the insert fails.
        conn.close()


if __name__ == "__main__":
    file_path = "llm/ecologist_claims_raw_data_april_2023_dec_2024.csv"
    output_dir = "llm/output"
    batch_size = 500  # Batch size
    keyword = "écologiste"

    data = read_csv(file_path)
    asyncio.run(process_batches(data, batch_size, keyword, output_dir))

    logging.info("Processing completed.")